2050s 500-year floodplain

1. Load and Clean 2020 Census Data

# (1) Load demographic data
# The Census API key is read from the environment so no credential is
# committed to the script.
credential <- Sys.getenv("census_api_key")

# Variable lookup table for the 2020 5-year ACS (cached after first download).
lookup_var <- load_variables(2020, "acs5", cache = TRUE)
glimpse(lookup_var)
## Rows: 27,850
## Columns: 3
## $ name    <chr> "B01001_001", "B01001_002", "B01001_003", "B01001_004", "B0100…
## $ label   <chr> "Estimate!!Total:", "Estimate!!Total:!!Male:", "Estimate!!Tota…
## $ concept <chr> "SEX BY AGE", "SEX BY AGE", "SEX BY AGE", "SEX BY AGE", "SEX B…
# ACS variable codes grouped by topic; the names become the column names
# after pivot_wider() below.
race <- c(white = "B02001_002", #race
          black = "B02001_003",
          indian_alaska = "B02001_004",
          asian = "B02001_005",
          pacific = "B02001_006")

education_attainment <- c(no_schooling = "B15003_002", #education_attainment
                          elementary_school = "B15003_009",
                          middle_school = "B15003_012",
                          high_school = "B15003_017",
                          bachelor = "B15003_022")

employment_status <- c(employed = "B23025_004", # employment_status
                       unemployed = "B23025_005",
                       # was "23025_007" (missing the "B" table prefix);
                       # get_acs() cannot resolve a code without it
                       no_in_labor_force = "B23025_007")

sex_by_age <- c(total_by_age ="B01001_001",  #sex_by_age
                male_by_age_total = "B01001_002",
                female_by_age_total = "B01001_026")

# Pull tract-level 2020 5-year ACS estimates for the five NYC boroughs.
demogdata <- get_acs(
  geography = "tract",
  year = 2020,
  variables = c(race,
                education_attainment,
                employment_status,
                sex_by_age,
                hhincome = "B19013_001", # median household income
                poverty = "B17020_002"), # population below poverty level
  key = credential,
  state = "36", # New York State
  # Quote FIPS codes: bare numerals drop the leading zeros (005 becomes 5).
  county = c("005", "047", "061", "081", "085")
)

# (2) Tidy Demographic Data
# Standardize names, split the combined place name, and spread to one row
# per tract with one column per ACS variable.
demo_clean <- demogdata %>%
  select(-moe) %>%
  # lower-case every column name and replace spaces with underscores
  rename_all(~ str_replace_all(str_to_lower(.), " ", "_")) %>%
  separate(name, into = c("census_tract", "county", "state"), sep = ", ") %>%
  select(-state) %>%
  pivot_wider(names_from = variable, values_from = estimate)

# (3) Missing Data Imputation

# Impute missing household income with the average income of the tract's
# county (computed from the non-missing tracts only).
mean_impute <- demo_clean %>%
  filter(!is.na(hhincome)) %>%
  group_by(county) %>%
  summarise(hhincome = mean(hhincome))
mean_impute
## # A tibble: 5 × 2
##   county          hhincome
##   <chr>              <dbl>
## 1 Bronx County      47615.
## 2 Kings County      71143.
## 3 New York County  104136.
## 4 Queens County     77123.
## 5 Richmond County   85720.
# Join the county means back in and fill only the missing tract values;
# coalesce() keeps the observed value when present, else the county mean.
demo_clean <- demo_clean %>%
  left_join(mean_impute, by = "county") %>%
  mutate(hhincome = coalesce(hhincome.x, hhincome.y)) %>%
  select(-hhincome.x, -hhincome.y)
view(demo_clean)

# (4) Mutate more variables

# Derived percentages; NA (no population reported) is coded as 0, matching
# the original behavior.
demo_clean <- demo_clean %>%
  mutate(
    minority_pop = black + asian + indian_alaska + pacific,
    # share of non-white residents among the enumerated race groups
    rate_of_minorities = round(minority_pop / (white + minority_pop) * 100, 1),
    rate_of_minorities = replace(rate_of_minorities, is.na(rate_of_minorities), 0),
    # employed share of the (employed + unemployed) labor force
    employment_rate = round(employed / (employed + unemployed) * 100, 1),
    employment_rate = replace(employment_rate, is.nan(employment_rate), 0)
  ) %>%
  select(-minority_pop)

# (5) Add Geometry and Convert to sf Object
# Tract polygons for the five boroughs, reprojected to WGS84 (EPSG:4326).
geo <- tracts(
  state = "36",
  # Quote FIPS codes so the leading zeros survive (005, not 5).
  county = c("005", "047", "061", "081", "085"),
  cb = TRUE # generalized cartographic-boundary files (smaller download)
) %>%
  st_transform(crs = 4326) %>%
  select(GEOID, geometry)
##
  |
  |                                                                      |   0%
  |
  |=                                                                     |   1%
  |
  |=                                                                     |   2%
  |
  |==                                                                    |   2%
  |
  |==                                                                    |   3%
  |
  |===                                                                   |   4%
  |
  |===                                                                   |   5%
  |
  |====                                                                  |   6%
  |
  |=====                                                                 |   7%
  |
  |=====                                                                 |   8%
  |
  |======                                                                |   8%
  |
  |======                                                                |   9%
  |
  |=======                                                               |  10%
  |
  |=======                                                               |  11%
  |
  |========                                                              |  11%
  |
  |========                                                              |  12%
  |
  |=========                                                             |  13%
  |
  |==========                                                            |  14%
  |
  |==========                                                            |  15%
  |
  |===========                                                           |  16%
  |
  |============                                                          |  17%
  |
  |============                                                          |  18%
  |
  |=============                                                         |  18%
  |
  |=============                                                         |  19%
  |
  |==============                                                        |  20%
  |
  |==============                                                        |  21%
  |
  |===============                                                       |  21%
  |
  |================                                                      |  22%
  |
  |================                                                      |  23%
  |
  |=================                                                     |  24%
  |
  |==================                                                    |  25%
  |
  |==================                                                    |  26%
  |
  |===================                                                   |  27%
  |
  |====================                                                  |  28%
  |
  |====================                                                  |  29%
  |
  |=====================                                                 |  30%
  |
  |======================                                                |  31%
  |
  |=======================                                               |  32%
  |
  |=======================                                               |  33%
  |
  |========================                                              |  34%
  |
  |=========================                                             |  35%
  |
  |=========================                                             |  36%
  |
  |==========================                                            |  37%
  |
  |===========================                                           |  38%
  |
  |===========================                                           |  39%
  |
  |============================                                          |  40%
  |
  |=============================                                         |  41%
  |
  |==============================                                        |  42%
  |
  |==============================                                        |  43%
  |
  |===============================                                       |  44%
  |
  |================================                                      |  45%
  |
  |================================                                      |  46%
  |
  |=================================                                     |  47%
  |
  |==================================                                    |  48%
  |
  |==================================                                    |  49%
  |
  |===================================                                   |  50%
  |
  |====================================                                  |  51%
  |
  |====================================                                  |  52%
  |
  |=====================================                                 |  52%
  |
  |=====================================                                 |  53%
  |
  |======================================                                |  54%
  |
  |======================================                                |  55%
  |
  |=======================================                               |  55%
  |
  |=======================================                               |  56%
  |
  |========================================                              |  57%
  |
  |========================================                              |  58%
  |
  |=========================================                             |  58%
  |
  |=========================================                             |  59%
  |
  |==========================================                            |  60%
  |
  |===========================================                           |  61%
  |
  |===========================================                           |  62%
  |
  |============================================                          |  62%
  |
  |============================================                          |  63%
  |
  |=============================================                         |  64%
  |
  |=============================================                         |  65%
  |
  |==============================================                        |  65%
  |
  |==============================================                        |  66%
  |
  |===============================================                       |  67%
  |
  |===============================================                       |  68%
  |
  |================================================                      |  68%
  |
  |================================================                      |  69%
  |
  |=================================================                     |  70%
  |
  |==================================================                    |  71%
  |
  |==================================================                    |  72%
  |
  |===================================================                   |  73%
  |
  |====================================================                  |  74%
  |
  |====================================================                  |  75%
  |
  |=====================================================                 |  76%
  |
  |======================================================                |  77%
  |
  |======================================================                |  78%
  |
  |=======================================================               |  79%
  |
  |========================================================              |  80%
  |
  |=========================================================             |  81%
  |
  |=========================================================             |  82%
  |
  |==========================================================            |  83%
  |
  |===========================================================           |  84%
  |
  |===========================================================           |  85%
  |
  |============================================================          |  86%
  |
  |=============================================================         |  87%
  |
  |=============================================================         |  88%
  |
  |==============================================================        |  89%
  |
  |===============================================================       |  90%
  |
  |================================================================      |  91%
  |
  |================================================================      |  92%
  |
  |=================================================================     |  93%
  |
  |==================================================================    |  94%
  |
  |==================================================================    |  95%
  |
  |===================================================================   |  96%
  |
  |====================================================================  |  97%
  |
  |====================================================================  |  98%
  |
  |===================================================================== |  99%
  |
  |======================================================================| 100%
# Attach tract geometry to the cleaned demographics and convert to sf.
demography_sf <- demo_clean %>%
  left_join(geo, by = c("geoid" = "GEOID")) %>%
  st_as_sf() %>%
  st_transform(crs = 4326) %>%
  # numeric geoid simplifies later comparisons/joins on this column
  mutate(geoid = as.numeric(geoid))

2. Load and Clean Sea Level Rise Maps (2050s 500-year Floodplain)

# Download and unpack the 2050s 500-year floodplain shapefile.
flood_dir <- "data/Sea Level Rise Maps (2050s 500-year Floodplain)"
flood_zip <- paste0(flood_dir, ".zip")
floodplain500_url <- "https://data.cityofnewyork.us/api/geospatial/qwca-zqw3?method=export&format=Shapefile"

download.file(floodplain500_url, destfile = flood_zip, mode = "wb")
unzip(zipfile = flood_zip, exdir = flood_dir)
file.remove(flood_zip)

# The shapefile name inside the export changes every time, so discover it.
# NOTE: the original listed path = "data" (non-recursively) and then
# prepended the subdirectory again, so the freshly unzipped .shp was never
# found; search the extraction directory itself instead.
shp_path <- list.files(path = flood_dir, pattern = "\\.shp$", full.names = TRUE)

floodplain500 <- st_read(shp_path) %>%
  st_transform(crs = 4326)
class(floodplain500)

# Standardize column names. The original pipeline's result was discarded
# because it was never assigned back to floodplain500.
floodplain500 <- floodplain500 %>%
  rename_all(~str_to_lower(.)) %>%
  # replace any white space in column names with underscores
  rename_all(~str_replace_all(., " ", "_"))

3. Set Up Models for the Floodplain Data

# joins floodplain with census tract map and codes tracts by whether they are in floodplain
# Turn off s2 so st_intersects runs on planar (GEOS) geometry; avoids s2
# topology errors on these polygons.
sf_use_s2(FALSE)
# left = FALSE makes this an inner join: only tracts intersecting the
# floodplain are kept.
floodtracts <- st_join(geo,floodplain500, join = st_intersects, left = FALSE)
flood_demography_sf <- demography_sf %>%
  # flood = 1 if the tract intersects the 2050s 500-year floodplain.
  # NOTE(review): geoid is numeric here while floodtracts$GEOID is character;
  # %in% coerces the numeric side to character, which appears to match the
  # 11-digit codes — confirm no precision/formatting mismatch.
  mutate(flood = ifelse(geoid %in% floodtracts$GEOID, 1, 0)) %>%
  na.omit() %>%
  mutate(flood = factor(flood))

# splits data into training and testing groups (keep both sf and df objects:
# sf objects to plot training data in EDA and testing data in assessment,
# data-frame objects for modeling, which ignores geometry)

set.seed(20220422)

# 70/30 split, stratified on the outcome so both sets keep the flood ratio.
flood_split <- initial_split(flood_demography_sf, prop = 0.7, strata = "flood")
flood_train_sf <- training(flood_split)
flood_test_sf <- testing(flood_split)

# Modeling tibbles: replace the polygon with its centroid coordinates.
# NOTE: for sf POINT geometries the first coordinate is X (longitude) and
# the second is Y (latitude); the original assigned them the other way round.
flood_train_df <- flood_train_sf %>%
  st_centroid() %>%
  mutate(lon = unlist(map(geometry, 1)), lat = unlist(map(geometry, 2))) %>%
  na.omit() %>%
  as_tibble() %>%
  select(-c(geometry, geoid, census_tract, county))

flood_test_df <- flood_test_sf %>%
  st_centroid() %>%
  mutate(lon = unlist(map(geometry, 1)), lat = unlist(map(geometry, 2))) %>%
  na.omit() %>%
  as_tibble() %>%
  # was select(-geometry, geoid, census_tract, county): mixing a negative
  # selection with bare names silently KEPT the id columns, so the test
  # frame did not match the training frame's columns
  select(-c(geometry, geoid, census_tract, county))
# Visualize the relationship between demographic data and flooding
# Descriptive Analysis: distribution snapshot of every column
flood_train_sf  %>%
  summary()
##      geoid           census_tract          county           total_by_age
##  Min.   :3.601e+10   Length:1628        Length:1628        Min.   :    0
##  1st Qu.:3.605e+10   Class :character   Class :character   1st Qu.: 2216
##  Median :3.606e+10   Mode  :character   Mode  :character   Median : 3402
##  Mean   :3.606e+10                                         Mean   : 3598
##  3rd Qu.:3.608e+10                                         3rd Qu.: 4669
##  Max.   :3.609e+10                                         Max.   :16600
##  male_by_age_total female_by_age_total     white           black
##  Min.   :   0      Min.   :   0        Min.   :    0   Min.   :   0.0
##  1st Qu.:1069      1st Qu.:1135        1st Qu.:  408   1st Qu.:  53.0
##  Median :1603      Median :1754        Median : 1124   Median : 264.5
##  Mean   :1719      Mean   :1879        Mean   : 1487   Mean   : 855.9
##  3rd Qu.:2250      3rd Qu.:2454        3rd Qu.: 2103   3rd Qu.:1426.0
##  Max.   :7280      Max.   :9320        Max.   :11625   Max.   :7248.0
##  indian_alaska        asian           pacific        no_schooling
##  Min.   :  0.00   Min.   :   0.0   Min.   :  0.00   Min.   :  0.0
##  1st Qu.:  0.00   1st Qu.:  61.0   1st Qu.:  0.00   1st Qu.: 17.0
##  Median :  0.00   Median : 233.5   Median :  0.00   Median : 50.5
##  Mean   : 14.88   Mean   : 519.0   Mean   :  2.32   Mean   : 77.2
##  3rd Qu.: 11.00   3rd Qu.: 679.8   3rd Qu.:  0.00   3rd Qu.:109.0
##  Max.   :971.00   Max.   :5687.0   Max.   :405.00   Max.   :785.0
##  elementary_school middle_school     high_school        bachelor
##  Min.   :  0.00    Min.   :  0.00   Min.   :   0.0   Min.   :   0.0
##  1st Qu.:  0.00    1st Qu.:  0.00   1st Qu.: 259.5   1st Qu.: 246.0
##  Median :  4.00    Median : 22.00   Median : 449.5   Median : 428.5
##  Mean   : 18.67    Mean   : 41.48   Mean   : 505.3   Mean   : 574.9
##  3rd Qu.: 25.00    3rd Qu.: 56.00   3rd Qu.: 696.0   3rd Qu.: 752.2
##  Max.   :417.00    Max.   :798.00   Max.   :2685.0   Max.   :5903.0
##     poverty          employed      unemployed        hhincome
##  Min.   :   0.0   Min.   :   0   Min.   :   0.0   Min.   :  2499
##  1st Qu.: 181.8   1st Qu.:1020   1st Qu.:  45.0   1st Qu.: 50856
##  Median : 401.5   Median :1592   Median :  92.0   Median : 70978
##  Mean   : 617.1   Mean   :1731   Mean   : 122.5   Mean   : 74456
##  3rd Qu.: 816.5   3rd Qu.:2222   3rd Qu.: 168.0   3rd Qu.: 90242
##  Max.   :4350.0   Max.   :9758   Max.   :1129.0   Max.   :250001
##  rate_of_minorities employment_rate           geometry    flood
##  Min.   :  0.00     Min.   :  0.00   MULTIPOLYGON :1628   0:1054
##  1st Qu.: 20.80     1st Qu.: 90.80   epsg:4326    :   0   1: 574
##  Median : 46.60     Median : 94.10   +proj=long...:   0
##  Mean   : 48.52     Mean   : 89.68
##  3rd Qu.: 77.55     3rd Qu.: 96.40
##  Max.   :100.00     Max.   :100.00
# Relationship Between Rate of Employment, Rate of Minorities and Poverty
flood_train_sf  %>%
  ggplot(aes(x = employment_rate, y = rate_of_minorities)) +
    geom_point(aes(color = poverty), alpha = 0.4) +
    labs(title = "Relationship Between Rate of Employment, Concentration of Minorities and Poverty \nBased on All Counties in New York City",
       subtitle = "No clear relationship found among the three variables",
       caption = "Data source: 2019 1-year ACS, US Census Bureau",
       x = "Employment Rate (%)",
       y = "Rate of Minorities (%)",
       color = "Poverty"
       ) +
  # the original omitted the "+" above, so theme_minimal() was evaluated on
  # its own line and the theme was never applied to the plot
  theme_minimal()
#Flooding Data on Map

# Median household income choropleth, with floodplain tracts overlaid.
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  # hover text: "<county>: <tract>: <income>"
  mutate(tooltip = paste(county, census_tract, hhincome, sep = ": "))

plot3 <- ggplot() +
  geom_sf_interactive(
    data = flood_train_sf_plot,
    aes(fill = hhincome, tooltip = tooltip, data_id = hhincome),
    color = "white",
    size = 0.2
  ) +
  # floodplain tracts in translucent red
  geom_sf(data = filter(flood_train_sf, flood == 1), fill = "red", alpha = 0.5) +
  scale_fill_distiller(palette = "Blues", direction = 1, labels = label_dollar()) +
  labs(title = "Median Household Income in New York City",
       subtitle = "By Counties in New York City",
       caption = "Data source: 2019 1-year ACS, US Census Bureau",
       fill = "Household Income") +
  theme_void()

# Interactive rendering: orange hover highlight, zoom up to 10x.
girafe(ggobj = plot3) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
#Graph to show rate of minorities in floodplain by county
# Hover text: "<county>: <tract>: <rate_of_minorities>".
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  mutate(tooltip = paste(county, census_tract, rate_of_minorities, sep = ": "))

P4 <- ggplot() +
  geom_sf_interactive(
    data = flood_train_sf_plot,
    color = "white",
    aes(fill = rate_of_minorities, tooltip = tooltip, data_id = rate_of_minorities),
    size = 0.5) +
  # Floodplain tracts overlaid in translucent red.
  geom_sf(
    data = filter(flood_train_sf, flood == 1),
    fill = "red",
    color = "red",
    alpha = 0.2,
    show.legend = "point"
    ) +
  scale_fill_viridis_c() +
#  scale_fill_distiller(palette = "Blues", direction = 1) +
  labs(
    title = "Concentration of Minorities in New York City",
    subtitle = "Flooding Impact by Rate of Minorities",
    caption = "Data source: 2019 5-year ACS, US Census Bureau",
    fill = "Concentration of Minorities") +
  theme_void()

# Interactive rendering: orange hover highlight, zoom up to 10x.
girafe(ggobj = P4) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
# Employment-rate choropleth, with floodplain tracts overlaid.
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  # hover text: "<county>: <tract>: <employment_rate>"
  mutate(tooltip = paste(county, census_tract, employment_rate, sep = ": "))

P5 <- ggplot() +
  geom_sf_interactive(
    data = flood_train_sf_plot,
    aes(fill = employment_rate, tooltip = tooltip, data_id = employment_rate),
    color = "white",
    size = 0.2
  ) +
  # floodplain tracts in translucent red
  geom_sf(
    data = filter(flood_train_sf, flood == 1),
    fill = "red",
    color = "red",
    alpha = 0.5
  ) +
  scale_fill_distiller(palette = "Blues", direction = 1) +
  labs(
    title = "Rate of Employment in New York City",
    subtitle = "Flooding Impact by Rate of Employment",
    caption = "Data source: 2019 5-year ACS, US Census Bureau",
    fill = "Rate of Employment"
  ) +
  theme_void()

girafe(ggobj = P5) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
#Graph to show poverty rate income in floodplain by county
# Hover text: "<county>: <tract>: <poverty>".
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  mutate(tooltip = paste(county, census_tract, poverty, sep = ": "))

# NOTE(review): unlike the other maps, no scale_fill_* is set here, so
# ggplot's default continuous gradient is used — confirm that is intentional.
P6 <- ggplot() +
  geom_sf_interactive(
    data = flood_train_sf_plot,
    color = "white",
    aes(fill = poverty,tooltip = tooltip, data_id = poverty), size = 0.2) +
  # Floodplain tracts overlaid in translucent red.
  geom_sf(
    data = filter(flood_train_sf, flood == 1),
    fill = "red",
    color = "red",
    alpha = 0.5) +
  labs(
    title = " Poverty in New York City",
    subtitle = "Flooding Impact by Incidences of Poverty",
    caption = "Data source: 2019 5-year ACS, US Census Bureau",
    fill = "Poverty Levels") +
  theme_void()

# Interactive rendering: orange hover highlight, zoom up to 10x.
girafe(ggobj = P6) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
# Shared preprocessing for all flood models: center and scale every predictor.
flood_rec <- recipe(flood ~ ., data = flood_train_df) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

# 10-fold cross-validation resamples of the training data.
flood_folds <- vfold_cv(flood_train_df, v = 10, repeats = 1)

# Decision Tree Model
# CART (rpart) classifier with tree depth tuned over a 10-level regular grid.
cart_mod <- decision_tree(tree_depth = tune()) %>%
  set_engine(engine = "rpart") %>%
  set_mode(mode = "classification")

cart_grid <- grid_regular(tree_depth(), levels = 10)

cart_wf <- workflow() %>%
  add_recipe(flood_rec) %>%
  add_model(cart_mod)

# Tune across the CV folds, scoring candidates by ROC AUC.
cart_cv <- cart_wf %>%
  tune_grid(
    resamples = flood_folds,
    grid = cart_grid,
    metrics = metric_set(roc_auc)
  )


# Logistic Model
# run an initial logistic regression to get data for variable importance
floodlogit <- glm(flood ~., data = flood_train_df, family = "binomial")

# displays importance of 10 most important variables
# NOTE(review): varImp() here is presumably caret's, which for glm reports
# the absolute z-statistic per coefficient — confirm caret is the package
# attached.
importances <- varImp(floodlogit)
importances %>%
  arrange(desc(Overall)) %>%
  top_n(10)
##                  Overall
## employed        6.385857
## employment_rate 5.805587
## bachelor        4.560309
## unemployed      3.768257
## asian           3.305981
## total_by_age    3.136806
## lon             2.961183
## hhincome        1.727637
## lat             1.636568
## poverty         1.284834
# According to https://stackoverflow.com/questions/47822694/logistic-regression-tuning-parameter-grid-in-r-caret-package/48218280#48218280, there is no tuning parameter for a glm model.
# Plain logistic regression (no tuning parameters), resampled over the folds.
logistic_mod <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

logistic_wf <- workflow() %>%
  add_model(logistic_mod) %>%
  add_recipe(flood_rec)

logistic_cv <- logistic_wf %>%
  fit_resamples(
    # was `resample =`; the argument is `resamples` (partial matching
    # happened to work, but spell it out)
    resamples = flood_folds,
    metrics = metric_set(roc_auc)
    )




# KNN Model
# k-nearest-neighbors classifier with k tuned over a 10-level regular grid.
flood_knn_mod <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine(engine = "kknn") %>%
  set_mode(mode = "classification")

knn_grid <- grid_regular(neighbors(), levels = 10)

flood_knn_wf <- workflow() %>%
  add_recipe(flood_rec) %>%
  add_model(flood_knn_mod)

flood_knn_cv <- flood_knn_wf %>%
  tune_grid(
    # was `resample =`; the argument is `resamples`
    resamples = flood_folds,
    grid = knn_grid,
    metrics = metric_set(roc_auc)
    )
# Per-fold ROC AUC for each model.
# NOTE(review): with summarize = FALSE, the tuned models (CART, KNN) yield
# one .estimate per grid candidate per fold, so the mean below averages over
# ALL candidate hyperparameters rather than the best one — this likely
# understates the tuned models in the comparison. Confirm this is intended.
cart_roc <- cart_cv %>%
  collect_metrics(summarize = FALSE) %>%
  group_by(id) %>%
  summarise(cart_roc_auc = mean(.estimate))

logistic_roc <- logistic_cv %>%
  collect_metrics(summarize = FALSE) %>%
  group_by(id) %>%
  summarise(logistic_roc_auc = mean(.estimate))

knn_roc <- flood_knn_cv %>%
  collect_metrics(summarize = FALSE) %>%
  group_by(id) %>%
  summarise(knn_roc_auc = mean(.estimate))

# One row per fold x model, in long format for plotting.
# NOTE(review): this tibble masks yardstick::roc_auc from here on; safe only
# because every metric_set(roc_auc) call runs earlier — consider renaming.
roc_auc <- bind_cols(
  fold = cart_roc$id,
  cart = cart_roc$cart_roc_auc,
  logistic = logistic_roc$logistic_roc_auc,
  knn = knn_roc$knn_roc_auc
) %>%
  pivot_longer(
    cols = 2:4,
    names_to = "model",
    values_to = "roc_auc"
  )

# Boxplot of fold-level ROC AUC by model, plus a mean-per-model table.
P7 <- ggplot(roc_auc) +
  geom_boxplot(aes(x = model, y = roc_auc)) +
  labs(
    title = "Compare the ROC_AUC across Decision Tree, Logistic, KNN",
    subtitle = "Decision Tree has the highest ROC_AUC",
    x = "Models"
  )

# Mean ROC AUC for each model across the ten folds.
T1 <- roc_auc %>%
  group_by(model) %>%
  summarise(roc_auc = mean(roc_auc))

# Plot and summary table side by side (patchwork).
P7 + gridExtra::tableGrob(T1)

# CART is the best model

# Pick the tree depth with the highest cross-validated ROC AUC.
flood_cart_best <- cart_cv %>%
  select_best(metric = "roc_auc")

flood_final_wf <- cart_wf %>%
  finalize_workflow(
  parameters = flood_cart_best
)

# Refit the finalized workflow on the full training set.
flood_final_fit <- flood_final_wf %>%
  fit(data = flood_train_df)

# Drill through the fitted workflow ($fit$fit$fit) to reach the underlying
# rpart object and draw the tree.
rpart.plot::rpart.plot(x = flood_final_fit$fit$fit$fit)

# Attach class and probability predictions to the sf test set so results can
# be mapped. NOTE(review): bind_cols assumes flood_test_sf and flood_test_df
# have identical row order and count — flood_demography_sf was na.omit()-ed
# upstream so this holds here, but it is fragile; confirm if the upstream
# cleaning changes.
flood_assess <- bind_cols(
  flood_test_sf,
  predict(object = flood_final_fit, new_data = flood_test_df, type = "class"),
  predict(object = flood_final_fit, new_data = flood_test_df, type = "prob")
) %>%
  mutate(
    # label each tract by its confusion-matrix cell
    assessment = case_when(
      flood == 1 & .pred_class == 1 ~ "true positive",
      flood == 1 & .pred_class == 0 ~ "false negative",
      flood == 0 & .pred_class == 1 ~ "false positive",
      flood == 0 & .pred_class == 0 ~ "true negative"
    )
  )

# ROC curve on the held-out test set.
P_assess1 <- flood_assess %>%
  roc_curve(truth = flood, estimate = .pred_0) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
           geom_path() +
           geom_abline(lty = 3) +
           coord_equal() +
           theme_minimal()

# Map of prediction outcomes (TP/FN/FP/TN) by tract.
P_assess2 <- flood_assess %>%
  ggplot() +
  geom_sf(aes(fill = assessment), color = "white", alpha = 0.7)+
  theme_void() +
  labs(
    # the finalized model is the CART decision tree (selected above), not
    # KNN; title corrected, along with the "postive" typo
    title = "Assessment of the Finalized CART Model",
    subtitle = "True positive and true negative take large share of the total",
    fill = ""
  )

P_assess1 + P_assess2

# Population totals (total_by_age) inside vs. outside the floodplain.
flood_train_sf  %>%
  group_by(flood)%>%
  summarize(n = sum(total_by_age))
## Simple feature collection with 2 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25609 ymin: 40.4961 xmax: -73.70036 ymax: 40.91771
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 3
##   flood       n                                                         geometry
##   <fct>   <dbl>                                               <MULTIPOLYGON [°]>
## 1 0     3778613 (((-74.16805 40.55272, -74.16793 40.55294, -74.16618 40.55292, …
## 2 1     2078265 (((-74.25237 40.49948, -74.25425 40.50188, -74.25459 40.5023, -…

Sandy Hurricane Damage

1. Load and clean 2012 census Data

# (1) Load demographic data 2012
credential <- Sys.getenv("census_api_key")

lookup_var <- load_variables(2012, "acs5", cache = TRUE)

# Re-download the same variable groups for the 2012 5-year ACS, statewide
# (no county filter, unlike the 2020 pull).
demo2012 <- get_acs(
  geography = "tract",
  year = 2012,
  variables = c(
    race,
    education_attainment,
    employment_status,
    sex_by_age,
    hhincome = "B19013_001",
    # NOTE(review): C17002_001 is the universe total of the
    # income-to-poverty-ratio table, not the below-poverty count used for
    # 2020 (B17020_002) — confirm this difference is intentional.
    poverty = "C17002_001"),
  state =  36,
  key = credential
)

# (2) Tidy demographic data
# Drop the margin-of-error and place-name columns, then spread to one row
# per tract with one column per ACS variable.
demo2012_clean <- demo2012 %>%
  select(-moe, -NAME) %>%
  rename(geoid = GEOID) %>%
  pivot_wider(names_from = variable, values_from = estimate)

# (3) Mutate variables
# Derived rates, mirroring the 2020 computation exactly.
demo2012_clean <- demo2012_clean %>%
  mutate(
    minority_pop = black + asian + indian_alaska + pacific,
    # share of non-white residents among the enumerated race groups
    rate_of_minorities = round(minority_pop / (white + minority_pop) * 100, 1),
    rate_of_minorities = replace(rate_of_minorities, is.na(rate_of_minorities), 0),
    # employed share of the (employed + unemployed) labor force
    employment_rate = round(employed / (employed + unemployed) * 100, 1),
    employment_rate = replace(employment_rate, is.nan(employment_rate), 0)
  ) %>%
  select(-minority_pop)

# (4) Convert into sf object

# NOTE: As Sandy happened in 2012, we use 2010 geographic data instead of 2020 one.
geo2010 <- tracts(
  state = 36,
  cb = TRUE,
  year = 2010
) %>%
  st_transform(crs = 4326)

geo2010 <- geo2010 %>%
  select(-STATE, -NAME, -LSAD, -CENSUSAREA, -COUNTYFP, -STATEFP) %>%
  clean_names() %>%
  # the 2010 CB files carry a long GEO_ID; keep its trailing 11-digit tract
  # code so it matches the ACS geoid
  mutate(geoid = substr(geo_id, 10, 20)) %>%
  select(-geo_id)

demo2012_sf <- left_join(
  x = demo2012_clean,
  y = geo2010,
  # make the join key explicit instead of relying on the natural join
  by = "geoid"
) %>%
  st_as_sf() %>%
  st_transform(crs = 4326)

2. Load and Clean Sandy Hurricane Damage Data and Inundation Data

 # Flooding damage
# Damage estimates come at block-group level; collapse to tract level (the
# first 11 GEOID digits) by summing damage within each tract.
sandy_damage <- read_csv("data/Sandy_Damage_Estimates_by_Block_Group.csv") %>%
  select(GEOID, total_dmg) %>%
  clean_names() %>%
  # keep New York State records only (state FIPS 36)
  filter(str_detect(geoid, "^36")) %>%
  mutate(geoid = substr(geoid, 1, 11)) %>%
  group_by(geoid) %>%
  mutate(total_dmg = sum(total_dmg)) %>%
  unique()

# Sandy high water mark (hwm)
# Point observations of peak flood elevation, converted to an sf object.
sandy_hwm_p <- read_csv("data/FilteredHWMs.csv") %>%
  clean_names() %>%
  select(latitude, longitude, state_name, elev_ft) %>%
  filter(state_name == "NY") %>%
  select(-state_name) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)

sf_use_s2(FALSE)
# Attach each high-water-mark point to the tract polygon containing it;
# tracts with no mark are dropped.
sandy_hwm_ct <- st_join(geo2010, sandy_hwm_p, join = st_contains) %>%
  filter(!is.na(elev_ft))

# Natural join — the only shared column between the two frames is geoid.
sandy_dmg_hwm <- inner_join(sandy_hwm_ct, sandy_damage)

# st_equals matches identical tract polygons, i.e. joins by geometry.
sandy_modeling_data <- st_join(sandy_dmg_hwm, demo2012_sf, join = st_equals) %>%
  select(-county.y, -tract.y, -geoid.y) %>%
  rename(county = county.x, tract = tract.x, geoid = geoid.x) %>%
  na.omit()

3. Set Up Models for Sandy Hurricane Data

set.seed(20220422)

# 75/25 train/test split of the Sandy modeling data.
sandy_split <- initial_split(sandy_modeling_data, prop = 0.75)
sandy_train_sf <- training(sandy_split)
sandy_test_sf <- testing(sandy_split)

# Modeling tibbles: drop identifiers and geometry (centroid coordinates were
# considered but are not included for this model).
sandy_drop_cols <- c("county", "tract", "geoid", "geometry")

sandy_train_df <- sandy_train_sf %>%
  as_tibble() %>%
  select(-all_of(sandy_drop_cols))

sandy_test_df <- sandy_test_sf %>%
  as_tibble() %>%
  select(-all_of(sandy_drop_cols))
# Choropleth of Sandy damage counts per tract (training set).
P8 <- ggplot(sandy_train_sf) +
  geom_sf(aes(fill = total_dmg), color = "white") +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "The Number of Damage Case Caused by Sandy Hurricane in New York State",
    caption = "Sandy Damage Estimates by Block Group, U.S. Department of Housing and Urban Development",
    fill = "Damage Case"
  ) +
  theme_void()


# Choropleth of Sandy high-water marks per tract (training set).
P9 <- ggplot(sandy_train_sf) +
  geom_sf(aes(fill = elev_ft), color = "white") +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "The High Water Mark During the Sandy Hurricane in New York State",
    caption = "USGS Flood Events Viewer (https://stn.wim.usgs.gov/fev/#Sandy)",
    fill = "High Water Mark (Feet)"
  ) +
  theme_void()

# Stack the two maps vertically (patchwork).
P8 / P9

# The elevation of water and the geometric information explains part of the total damage caused by Sandy hurricane.

# Choropleth EDA helper: map one demographic variable over the training
# tracts and store the plot as P_<demo_var> in the global environment
# (the later `P8 / P_*` patchwork calls depend on those global names).
#
# @param demo_var Character scalar naming a column of sandy_train_sf.
# @return The ggplot object (the value of assign(), returned invisibly).
sandy_EDA <- function(demo_var){

  P <- sandy_train_sf %>%
    # all_of() selects the column named by the string safely; passing the
    # bare character variable to rename() relies on deprecated
    # external-vector selection in tidyselect.
    rename(estimate = all_of(demo_var)) %>%
    ggplot() +
    geom_sf(aes(fill = estimate), color = "white") +
    scale_fill_gradient(
      low = "yellow",
      high = "red"
    ) +
    labs(
      title = paste("The Sandy Hurricane Hit Areas in New York State Differently by", demo_var, sep = " "),
      caption = "Data source: 2012 5-year ACS, US Census Bureau",
      fill = demo_var
    ) +
    theme_void()

  # Side effect: export P_<demo_var> for later composition with P8.
  assign(paste("P", demo_var, sep = "_"), P,  envir = .GlobalEnv)
}

map(.x = c("rate_of_minorities", "bachelor", "hhincome", "employment_rate"), .f = sandy_EDA)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

# Compare the damage map (P8) against each demographic map in turn.
P8 / P_bachelor

P8 / P_employment_rate

P8 / P_hhincome

P8 / P_rate_of_minorities

# According to the maps, there is no obvious relationship between demographic attributes and the number of damage cases caused by Hurricane Sandy.
# Ten-fold cross-validation resamples of the training data.
sandy_folds <- vfold_cv(sandy_train_df, v = 10, repeats = 1)

# Preprocessing recipe: standardize every numeric predictor.
# step_normalize() centers and then scales, equivalent to the
# step_center() + step_scale() pair.
sandy_rec <- recipe(total_dmg ~ ., data = sandy_train_df) %>%
  step_normalize(all_numeric_predictors())




# Random Forest Model: regression forest with tuned mtry and tree count.
sandy_rf_mod <- rand_forest(mtry = tune(), trees = tune()) %>%
  set_engine("randomForest") %>%
  set_mode("regression")

# mtry's upper bound depends on the number of predictors, so finalize()
# the parameter set against the training data before drawing a
# space-filling grid of 10 candidates.
rf_grid <- sandy_rf_mod %>%
  parameters() %>%
  finalize(sandy_train_df) %>%
  grid_latin_hypercube(size = 10)

sandy_rf_wf <- workflow() %>%
  add_recipe(sandy_rec) %>%
  add_model(sandy_rf_mod)

# Tune over the 10 folds, scoring by RMSE.
# Fix: the tune_grid() argument is `resamples`; the original `resample =`
# only worked through partial argument matching.
sandy_rf_cv <- sandy_rf_wf %>%
  tune_grid(
    resamples = sandy_folds,
    grid = rf_grid,
    metrics = metric_set(rmse)
  )




## Elastic Net Model: 50/50 lasso-ridge mix with tuned penalty.
sandy_en_mod <- linear_reg(penalty = tune(), mixture = 0.5) %>%
  set_engine("glmnet") %>%
  set_mode("regression")

# Regular grid over the penalty (log-10 scale by default), 10 levels.
en_grid <- grid_regular(penalty(), levels = 10)

sandy_en_wf <- workflow() %>%
  add_recipe(sandy_rec) %>%
  add_model(sandy_en_mod)

# Fix: spell out `resamples` (the original `resample =` relied on
# partial argument matching).
sandy_en_cv <- sandy_en_wf %>%
  tune_grid(
    resamples = sandy_folds,
    grid = en_grid,
    metrics = metric_set(rmse)
  )




## KNN Model: k-nearest-neighbors regression with tuned neighbor count.
sandy_knn_mod <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")

# Regular grid over the number of neighbors, 10 levels.
knn_grid <- grid_regular(neighbors(), levels = 10)

sandy_knn_wf <- workflow() %>%
  add_recipe(sandy_rec) %>%
  add_model(sandy_knn_mod)

# Fix: spell out `resamples` (the original `resample =` relied on
# partial argument matching).
sandy_knn_cv <- sandy_knn_wf %>%
  tune_grid(
    resamples = sandy_folds,
    grid = knn_grid,
    metrics = metric_set(rmse)
  )
# Per-fold mean RMSE, averaged across every tuning candidate, for one
# set of cross-validation results.
fold_mean_rmse <- function(cv_results) {
  cv_results %>%
    collect_metrics(summarize = FALSE) %>%
    group_by(id) %>%
    summarise(mean_rmse = mean(.estimate))
}

sandy_rf_rmse <- fold_mean_rmse(sandy_rf_cv) %>%
  rename(sandy_rf_rmse = mean_rmse)

sandy_en_rmse <- fold_mean_rmse(sandy_en_cv) %>%
  rename(sandy_en_rmse = mean_rmse)

sandy_knn_rmse <- fold_mean_rmse(sandy_knn_cv) %>%
  rename(sandy_knn_rmse = mean_rmse)

# Assemble per-fold RMSE for the three models, then reshape to long
# format (one row per fold x model) for plotting.
sandy_rmse <- tibble(
  fold = sandy_rf_rmse$id,
  random_forest = sandy_rf_rmse$sandy_rf_rmse,
  elastic_net = sandy_en_rmse$sandy_en_rmse,
  knn = sandy_knn_rmse$sandy_knn_rmse
) %>%
  pivot_longer(
    cols = -fold,
    names_to = "model",
    values_to = "rmse"
  )

# Boxplot of fold-level RMSE by model.
P10 <- ggplot(sandy_rmse) +
  geom_boxplot(aes(x = model, y = rmse)) +
  labs(
    title = "Compare the RMSE across Random Forest, Elastic Net, KNN",
    subtitle = "Random Forest has the lowest rmse",
    x = "Models"
  )

# Mean RMSE per model, shown as a table beside the boxplot.
T2 <- sandy_rmse %>%
  group_by(model) %>%
  summarise(rmse = mean(rmse))

P10 + gridExtra::tableGrob(T2)

## Random Forest is the best model among the three.

# Hyperparameters with the lowest cross-validated RMSE.
sandy_rf_best <- sandy_rf_cv %>%
  select_best(metric = "rmse")

sandy_final_wf <- sandy_rf_wf %>%
  finalize_workflow(
  parameters = sandy_rf_best
)

# Refit the finalized workflow on the full training set.
sandy_final_fit <- sandy_final_wf %>%
  fit(data = sandy_train_df)

# Predict the held-out tracts.
# Fix: pass the test tibble directly; `tibble(sandy_test_df)` packs the
# whole data frame into a single df-column under current tibble versions,
# hiding the predictor columns from predict().
sandy_prediction <- sandy_final_fit %>%
  predict(new_data = sandy_test_df)

# Attach the predictions to the sf test set for assessment and mapping.
sandy_assess <- bind_cols(
  sandy_test_sf,
  predict_dmg = sandy_prediction$.pred
)

# Test-set RMSE.
# Fix: as_tibble() coerces the sf object to a plain tibble; tibble()
# would pack it into a single df-column and hide the metric columns.
as_tibble(sandy_assess) %>%
  rmse(truth = total_dmg, estimate = predict_dmg)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        464.
# Overlay the distributions of observed and predicted damage counts.
# Fix: as_tibble() coerces the sf object to a plain tibble; tibble()
# would pack it into a single df-column and break select().
as_tibble(sandy_assess) %>%
  select(total_dmg, predict_dmg) %>%
  pivot_longer(
    cols = 1:2,
    names_to = "true_prediction",
    values_to = "dmg"
  ) %>%
  ggplot() +
  geom_density(aes(x = dmg, group = true_prediction, fill = true_prediction, color = true_prediction), alpha = 0.5) +
  labs(
    title = "The Distribution of True and Predicted Damage Case",
    subtitle = "The model captures some features of the true distribution but fail to match well",
    x = "Damage Case",
    fill = "",
    color = ""
    )

# Map the final model's residual (observed - predicted) in each tract.
sandy_assess %>%
  mutate(residual = total_dmg - predict_dmg) %>%
  ggplot() +
  geom_sf(aes(fill = residual), color = "white") +
  # NOTE(review): low and high are both "red", so strongly negative and
  # strongly positive residuals are visually indistinguishable; a true
  # diverging palette (e.g. low = "blue") was likely intended — confirm
  # before changing the published figure.
  scale_fill_gradient2(
    low = "red",
    mid = "pink",
    high = "red",
    # Per the subtitle, residuals outside [-150, 150] are deliberately
    # pushed out of the scale limits (default oob maps them to NA) so
    # mid-range variation remains visible.
    limits = c(-150, 150),
    guide = guide_colourbar(barheight = unit(4, "cm"))
  ) +
  labs(
    # NOTE(review): typos in the displayed strings ("Randome" -> "Random",
    # "limites" -> "limits") — fix in a follow-up that re-renders figures.
    title = "The Residual of The Randome Forest Model in Each Census Tract",
    subtitle = "Put tracts with residuals lower than -100 and higher than 100 out of limites to show \nvariation of residuals more clearly",
    fill = "Residual"
  ) +
  theme_void()